Data Cleaning

#import data
library(readr)
games_data <- read_csv("vgsales-12-4-2019-short.csv", 
    col_types = cols(Rank = col_integer())) #change Rank variable to integer class

#change different variables to factor class
games_data$Genre <- as.factor(games_data$Genre)
games_data$ESRB_Rating <- as.factor(games_data$ESRB_Rating)
games_data$Platform <- as.factor(games_data$Platform)
games_data$Publisher <- as.factor(games_data$Publisher)
games_data$Developer <- as.factor(games_data$Developer)
games_data$Year <- as.factor(games_data$Year)

#rename the levels of the ESRB Rating variable and order them
#KA (Kids to Adults) rating was changed to Everyone
library(plyr)
games_data$ESRB_Rating <- revalue(games_data$ESRB_Rating, c("E"="Everyone", "E10"="Everyone10+", "T"="Teen", "M"="Mature", "AO" = "AdultsOnly", "RP"="RatingPending", "KA"="Everyone", "EC"="EarlyChildhood"))
games_data$ESRB_Rating <- ordered(games_data$ESRB_Rating, levels = c("EarlyChildhood", "Everyone", "Everyone10+", "Teen", "Mature", "RatingPending", "AdultsOnly"))

#Change levels of Publisher and Developer variables called Unknown to missing
levels(games_data$Publisher)[levels(games_data$Publisher)=='Unknown'] <- NA
levels(games_data$Developer)[levels(games_data$Developer)=='Unknown'] <- NA

#Rename a couple Publisher and Developer levels
games_data$Publisher <- revalue(games_data$Publisher, 
                c("Microsoft Game Studios"="Microsoft Studios", 
                  "Valve"="Valve Corporation", "Valve Software"="Valve Corporation", #all Valve Corporation
                  "Sony Computer Entertainment"="Sony Interactive Entertainment")) #all Sony Interactive Entertainment
games_data$Developer <- revalue(games_data$Developer, c("Microsoft Game Studios"="Microsoft Studios", "Valve"="Valve Corporation", "Valve Software"="Valve Corporation", "Sony Computer Entertainment"="Sony Interactive Entertainment"))

#Rename the sub-genres to their overall Genre
games_data$Genre <- revalue(games_data$Genre, c("Platform"="Action", "Shooter"="Action", "Fighting"="Action", "Visual Novel"="Adventure", "Racing"="Sports"))

#order genres by similarities for graphing
games_data$Genre <- ordered(games_data$Genre, levels = c("Action", "Action-Adventure", "Adventure", "Role-Playing", "Simulation", "Strategy", "Sports", "Board Game", "Puzzle", "Education", "Music", "MMO", "Party", "Misc"))

attach(games_data)
summary(games_data)
##       Rank           Name                    Genre              ESRB_Rating   
##  Min.   :    1   Length:55792       Action      :17783   Everyone     :10814  
##  1st Qu.:13949   Class :character   Misc        : 9476   Teen         : 6157  
##  Median :27896   Mode  :character   Sports      : 8274   Mature       : 3314  
##  Mean   :27896                      Adventure   : 5553   Everyone10+  : 2897  
##  3rd Qu.:41844                      Role-Playing: 4551   RatingPending:  368  
##  Max.   :55792                      (Other)     :10146   (Other)      :   73  
##                                     NA's        :    9   NA's         :32169  
##     Platform               Publisher               Developer    
##  PC     :10978   Sega           : 2085   Konami         :  911  
##  PS2    : 3564   Activision     : 1519   Sega           :  817  
##  DS     : 3292   Ubisoft        : 1519   Capcom         :  684  
##  PS     : 2703   Electronic Arts: 1498   Namco          :  425  
##  XBL    : 2115   Konami         : 1495   SNK Corporation:  403  
##  PSN    : 2004   (Other)        :42785   (Other)        :47779  
##  (Other):31136   NA's           : 4891   NA's           : 4773  
##   Critic_Score     User_Score    Total_Shipped    Global_Sales  
##  Min.   : 1.00   Min.   : 2.00   Min.   : 0.03   Min.   : 0.00  
##  1st Qu.: 6.40   1st Qu.: 7.80   1st Qu.: 0.20   1st Qu.: 0.03  
##  Median : 7.50   Median : 8.50   Median : 0.59   Median : 0.12  
##  Mean   : 7.21   Mean   : 8.25   Mean   : 1.89   Mean   : 0.37  
##  3rd Qu.: 8.30   3rd Qu.: 9.10   3rd Qu.: 1.80   3rd Qu.: 0.36  
##  Max.   :10.00   Max.   :10.00   Max.   :82.86   Max.   :20.32  
##  NA's   :49256   NA's   :55457   NA's   :53965   NA's   :36377  
##     NA_Sales       PAL_Sales        JP_Sales      Other_Sales   
##  Min.   :0.00    Min.   :0.00    Min.   :0.00    Min.   :0.00   
##  1st Qu.:0.05    1st Qu.:0.01    1st Qu.:0.02    1st Qu.:0.00   
##  Median :0.12    Median :0.04    Median :0.05    Median :0.01   
##  Mean   :0.28    Mean   :0.16    Mean   :0.11    Mean   :0.04   
##  3rd Qu.:0.29    3rd Qu.:0.14    3rd Qu.:0.12    3rd Qu.:0.04   
##  Max.   :9.76    Max.   :9.85    Max.   :2.69    Max.   :3.12   
##  NA's   :42828   NA's   :42603   NA's   :48749   NA's   :40270  
##       Year      
##  2009   : 4507  
##  2010   : 3661  
##  2011   : 3489  
##  2008   : 2979  
##  2014   : 2905  
##  (Other):37272  
##  NA's   :  979

Genre vs. ESRB Rating

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#create subset where missing values in Genre and ESRB_Rating are omitted
data1 <- games_data[complete.cases(games_data$Genre, games_data$ESRB_Rating),] #23619 observations

#plot bar chart of Genre stacked by Rating
plot1 <- ggplot(data1, aes(x=Genre, fill=ESRB_Rating, stat="count"))+
  xlab("Genre")+
  ylab("Count")+
  ggtitle("Video Game Genres vs. ESRB Rating")+
  geom_bar(width=0.75)+
  theme(axis.text.x = element_text(angle = 85, vjust=0.5),
        plot.title = element_text(hjust=0.5))

plot1

Top 100 Ranked Video Games

#Create subset with top 100 games
top100 <- games_data[games_data$Rank < 101,]

#Top 100 ESRB Ratings
rating_100 <- ggplot(data=subset(top100, !is.na(ESRB_Rating)), aes(x=ESRB_Rating, fill=ESRB_Rating, stat="count"))+
  xlab("ESRB Rating")+
  ylab("Number of Games")+
  ggtitle("Top 100 Video Game Ratings")+
  geom_text(stat="count",aes(label=..count..), vjust = -.5) +
  theme(legend.position = "none", axis.text = element_text(size=12),
        plot.title = element_text(hjust=0.5))+
  geom_bar(width=0.75)

#Top 100 Genres
genre_100 <- ggplot(top100, aes(x=Genre, stat="count"))+
  xlab("Genre")+
  ylab("Number of Games")+
  ggtitle("Top 100 Video Game Genres")+
  geom_bar(width=0.75, fill="#F8766D")+
  geom_text(stat="count",aes(label=..count..), vjust = 0) +
  theme(legend.position = "none",axis.text.x = element_text(angle = 85, vjust=0.95, hjust = 0.95),
        plot.title = element_text(hjust=0.5))

#Top 100 Years
year_100 <- ggplot(top100, aes(x=Year, stat="count"))+
  xlab("Year")+
  ylab("Number of Games")+
  ggtitle("Top 100 Video Game Years")+
  geom_bar(width=0.75, fill="#C77CFF")+
  geom_text(stat="count",aes(label=..count..), vjust = -.5) +
  theme(legend.position = "none",axis.text.x = element_text(angle = 85, vjust=0.5),
        plot.title = element_text(hjust=0.5))

#Top 100 Publishers
pub_100 <- ggplot(data=subset(top100, !is.na(Publisher)), aes(x=Publisher, stat="count"))+
  xlab("Publisher")+
  ylab("Number of Games")+
  ggtitle("Top 100 Video Game Publisher")+
  geom_bar(width=0.75, fill="#00BFC4")+
  geom_text(stat="count",aes(label=..count..), vjust = 0) +
  theme(legend.position = "none",axis.text.x = element_text(angle = 85, vjust=0.95, hjust = 0.95),
        plot.title = element_text(hjust=0.5))

#Top 100 Critic Scores
critic_100 <- ggplot(data=subset(top100, !is.na(Critic_Score)))+ #without missing values
  geom_histogram(mapping=aes(x=Critic_Score), binwidth=0.1, fill="#7CAE00", colour="grey")+
  xlab("Critic Score")+
  ylab("Number of Games")+
  ggtitle("Top 100 Video Game Critic Scores")+
  theme(plot.title = element_text(hjust=0.5))

rating_100

genre_100

year_100

pub_100

critic_100

Critic Score vs. User Score

#create dataset without missing values in Critic or User score
data2 <- games_data[complete.cases(games_data$Critic_Score, games_data$User_Score),] #218 observations

#plot Critic score vs. User Score colored by ESRB Rating in plotly
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
plot3 <- plot_ly(data=data2, x = ~Critic_Score, y = ~User_Score, color = ~ESRB_Rating, type="scatter", mode = "markers")
plot3 <- plot3 %>% layout(title = "Critic Score vs. User Score",
                          xaxis = list(title = "Critic Score"),
                          yaxis = list(title = "User Score"))

plot3
#correlation between Critic and User Scores
cor(data2$Critic_Score, data2$User_Score)
## [1] 0.5826729

Critic Score and User Score vs. Sales

#create column in data that combines Total_Shipped and Global_Sales
#every observation with missing values in Total_shipped has value in Global_Sales and vice versa.
#Rank is by Total_Shipped and when there is a missing value, it goes by Global_Sales
games_data$total <- coalesce(games_data$Total_Shipped, games_data$Global_Sales)

#create subset of data without missing data in Critic_Score and total sales
data3 <- games_data[complete.cases(games_data$Critic_Score, games_data$total),] #4861 observations
#create subset of data without missing data in User_Score and total sales
data4 <- games_data[complete.cases(games_data$User_Score, games_data$total),] #242 observations

#plot Critic Score vs Total Sales, colored by ESRB Rating
plot4 <- plot_ly(data=data3, x = ~Critic_Score, y = ~total, color = ~ESRB_Rating, type="scatter", mode = "markers")
plot4 <- plot4 %>% layout(title = "Critic Score vs. Total Sales",
                          xaxis = list(title = "Critic Score"),
                          yaxis = list(title = "Total Sales (Millions)"))

#plot Critic Score vs Total Sales, colored by ESRB Rating
plot5 <- plot_ly(data=data4, x = ~User_Score, y = ~total, color = ~ESRB_Rating, type="scatter", mode = "markers")
plot5 <- plot5 %>% layout(title = "User Score vs. Total Sales",
                          xaxis = list(title = "User Score"),
                          yaxis = list(title = "Total Sales (Millions)"))

plot4
#correlation of Critic Score and Total Sales
cor(data3$Critic_Score, data3$total)
## [1] 0.2595252
plot5
#correlation of User Score and Total Sales
cor(data4$User_Score, data4$total)
## [1] 0.160098